In tangchao7498/DecodeR: Barcode demultiplexing of native RNAs nanopore sequencing

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.align = 'center')

library(data.table)
library(caret)
library(RColorBrewer)
library(pheatmap)

Accuracy validation

load(file = "/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/00.ModelTraining2/01.Model/DeePlexiCon_Classifier.RData")
TestData <- readRDS("/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/00.ModelTraining2/00.RData/DeePlexiCon_TestSet.Rds")

Fit1

ROC_Tab <- data.frame(obs = TestData$Class, 
                      predict(Fit1, TestData, type = "prob"), 
                      pred = predict(Fit1, newdata = TestData))

postResample(pred = ROC_Tab$pred, obs = ROC_Tab$obs)

(confM <- confusionMatrix(data = ROC_Tab$pred, reference = ROC_Tab$obs))

confP <- apply(confM$table, 2, function(x) x/sum(x))

pheatmap(confP * 100, 
         cluster_rows = F, 
         cluster_cols = F, 
         display_numbers = T, 
         fontsize = 15, 
         number_color = "red", 
         legend = FALSE, 
         color = colorRampPalette(brewer.pal(n = 9, name ="Blues"))(100))

pdf("/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/01.Comparison/Our_DeePlexiCon_confusionMatrix.pdf", width = 4, height = 4)
pheatmap(confP * 100, 
         cluster_rows = F, 
         cluster_cols = F, 
         display_numbers = T, 
         fontsize = 15, 
         number_color = "red", 
         legend = FALSE, 
         color = colorRampPalette(brewer.pal(n = 9, name ="Blues"))(100))
dev.off()

lapply(seq(1, 99, 1)/100, function(i) {
  ReadsPercent <- mean(apply(ROC_Tab[, 3:ncol(ROC_Tab) - 1], 1, max) >= i)*100
  ROC_Tab_PPT <- ROC_Tab[apply(ROC_Tab[, 3:ncol(ROC_Tab) - 1], 1, max) > i, ]
  Accu <- postResample(pred = ROC_Tab_PPT$pred, obs = ROC_Tab_PPT$obs)[1]
  data.frame(ReadsPercent = ReadsPercent, Accuracy = Accu)
}) -> Cutoff_Select
Cutoff_Select <- do.call(rbind, Cutoff_Select)
Cutoff_Select$Cutoff <- seq(1, 99, 1)/100

library(ggplot2)
ggplot(Cutoff_Select, aes(ReadsPercent, Accuracy)) + 
  geom_line() + 
  scale_x_reverse() + 
  labs(y = "Accuracy", x = "Percentage of classified reads") + 
  theme_classic(base_size = 15) + 
  theme(legend.position = "none", 
        axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12)) + 
  geom_hline(yintercept = 0.99)

table(Fit1$trainingData$.outcome)

DeePlexiCon

R2BC <- readRDS("/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/00.ModelTraining/00.RData/DeePlexiCon_TestSet_Meta.Rds")

deeplexicon_output <- fread("/mnt/raid62/BetaCoV/Person/tangchao/analysis/DrictRNA/deeplexicon/conda/output1.tsv")
deeplexicon_output[, ReadID := paste0("read_", ReadID)]
deeplexicon_pred <- melt(deeplexicon_output[, .(ReadID, P_bc_1, P_bc_2, P_bc_3, P_bc_4)])[, .(pred = variable[which.max(value)]), ReadID]
deeplexicon_output <- merge(deeplexicon_output, deeplexicon_pred, by = "ReadID")
deeplexicon_output <- merge(deeplexicon_output, R2BC[, .(read, Name, gene)], by.x = "ReadID", by.y = "read")
setnames(deeplexicon_output, "Name", "obs")

deeplexicon_output[, pred := plyr::mapvalues(x = pred, from = c("P_bc_1", "P_bc_2", "P_bc_3", "P_bc_4"), to = c("D-BC1", "D-BC2", "D-BC3", "D-BC4"))]
deeplexicon_output[, Barcode := plyr::mapvalues(x = Barcode, from = c("bc_1", "bc_2", "bc_3", "bc_4"), to = c("D-BC1", "D-BC2", "D-BC3", "D-BC4"))]

table(deeplexicon_output$pred)

mean(deeplexicon_output$pred != "unknown")

deeplexicon_output[, Barcode := factor(Barcode, levels = c("D-BC1", "D-BC2", "D-BC3", "D-BC4", "unknown"))]
deeplexicon_output[, obs := factor(obs, levels = c("D-BC1", "D-BC2", "D-BC3", "D-BC4", "unknown"))]

postResample(pred = deeplexicon_output$pred, obs = deeplexicon_output$obs)
mean(deeplexicon_output$pred == deeplexicon_output$obs)

mean(deeplexicon_output[pred != "unknown", ]$pred == deeplexicon_output[pred != "unknown", ]$obs)

(confM2 <- confusionMatrix(data = deeplexicon_output$Barcode, reference = deeplexicon_output$obs))

confP2 <- apply(confM2$table, 2, function(x) x/sum(x))

pheatmap(confP2[, 1:4] * 100, 
         cluster_rows = F, 
         cluster_cols = F, 
         display_numbers = T, 
         fontsize = 15, 
         number_color = "red", 
         legend = FALSE, 
         color = colorRampPalette(brewer.pal(n = 9, name ="Blues"))(100))

pdf("/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/01.Comparison/DeePlexiCon_confusionMatrix.pdf", width = 4, height = 4)
pheatmap(confP2[, 1:4] * 100, 
         cluster_rows = F, 
         cluster_cols = F, 
         display_numbers = T, 
         fontsize = 15, 
         number_color = "red", 
         legend = FALSE, 
         color = colorRampPalette(brewer.pal(n = 9, name ="Blues"))(100))
dev.off()

ROC_Tab2 <- data.frame(deeplexicon_output[, .(obs, P_bc_1, P_bc_2, P_bc_3, P_bc_4, pred)], row.names = deeplexicon_output[, ReadID])

lapply(seq(1, 99, 1)/100, function(i) {
  ReadsPercent <- mean(apply(ROC_Tab2[, 3:ncol(ROC_Tab2) - 1], 1, max) >= i)*100
  ROC_Tab_PPT <- ROC_Tab2[apply(ROC_Tab2[, 3:ncol(ROC_Tab2) - 1], 1, max) > i, ]
  Accu <- postResample(pred = ROC_Tab_PPT$pred, obs = ROC_Tab_PPT$obs)[1]
  data.frame(ReadsPercent = ReadsPercent, Accuracy = Accu)
}) -> Cutoff_Select2
Cutoff_Select2 <- do.call(rbind, Cutoff_Select2)
Cutoff_Select2$Cutoff <- seq(1, 99, 1)/100

ggplot(Cutoff_Select2, aes(ReadsPercent, Accuracy)) + 
  geom_line() + 
  scale_x_reverse() + 
  labs(y = "Accuracy", x = "Percentage of classified reads") + 
  theme_classic(base_size = 15) + 
  theme(legend.position = "none", 
        axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12)) + 
  geom_hline(yintercept = 0.99)

Cutoff_Tab <- rbind(data.table(Method = "My", Cutoff_Select), data.table(Method = "DeePlexiCon", Cutoff_Select2))

ggplot(Cutoff_Tab, aes(ReadsPercent, Accuracy, colour = Method)) + 
  geom_line() + 
  scale_x_reverse() + 
  labs(y = "Accuracy", x = "Percentage of classified reads") + 
  theme_classic(base_size = 15) + 
  theme(legend.position = "none", 
        axis.title = element_text(size = 16), 
        axis.text = element_text(size = 12)) + 
  geom_hline(yintercept = 0.99)

setwd("/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex/analysis/06.DecodingDeePlexiConAndPoreplex/01.Comparison")
saveRDS(confM, "Our_Method_confusionMatrix_Vs_DeePlexiCon.Rds")
saveRDS(confM2, "DeePlexiCon_confusionMatrix.Rds")
saveRDS(Cutoff_Tab, "DeePlexiCon_CutOff_List.Rds")